*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*
* Replication Files to: P-hacking in clinical trials and how incentives shape the distribution of results across phases 
* Date: April 2020
* Authors: Jérôme Adda, Christian Decker, and Marco Ottaviani
*
*	- Input: Raw Data & Interim Datasets from 2
* 			 		 
*	- Output: Interim Datasets & 'data_counterfactual_analysis_sec.dta'
*			 
* Topic: Generate dataset for robustness checks with secondary outcomes
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*

*------------------------*
* Section 0 -- Directory *
*------------------------*

clear all
clear matrix
cap log close
set more off

*-----*
* 0.1 *
*-----*

* Run the dofiles which defines all the globals for the folder's and sub-folders
* paths to both data and analysis.

if c(username) == "chrdec" {													
	do "C:/Users/chrdec/Dropbox/clinical trials/stata/PNAS revision/2_analysis/dofiles/0_globals.do"   			// 0_globals.do path

}

*-----*
* 0.2 * 
*-----*

* Set up the directory where data are stored
macro list
cd "${data}"
set matsize 11000

cap log off
log using "${run}${log}log_7a.txt", replace

*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~

*-------------------*
* Table of content: *
*-------------------*
* Sections: 
* 1. Merge Tables I 
* 2. Generate Variables
* 3. Generate Rankings of Industry Sponsors by Firm Size
* 4. Merge Tables II & Restrict Sample
* 5. Generate Dummies for Fixed Effects



*-----------------------------*
* Section 1 -- Merge Tables I *
*-----------------------------*

use "${data}${inter}studies.dta", clear
keep nct_id start_date completion_date study_type overall_status phase enrollment 
merge 1:1 nct_id using "${data}${inter}lead_sponsors.dta", keepusing(agency_class name)
rename name agency
drop _merge

merge 1:m nct_id using "${data}${inter}results_merged.dta", keepusing(id non_inferiority_type p_value p_value_modifier p_value_description outcome_type)
drop _merge
rename id results_outcome_analysis_id
rename p_value p
gen p_provided=1
replace p_provided=0 if p<0 | p>1 | p==.
drop if p_provided==0
replace p_value_modifier="<" if p==0 & p_value_modifier!=">"
replace p=0.0001 if p==0 & p_value_modifier!=">"

merge m:1 nct_id using "${data}${inter}intervention_drug_list.dta"
keep if _merge==3
drop _merge


*---------------------------------*
* Section 2 -- Generate Variables *
*---------------------------------*

replace p_value_modifier="" if p_value_modifier=="," | p_value_modifier=="=" | p_value_modifier=="NS" | p_value_modifier=="=" | p_value_modifier=="â¤"
replace p_value_modifier=subinstr(p_value_modifier,"=","",10) 
replace p_value_modifier=subinstr(p_value_modifier,"p","",10) 
replace p_value_modifier=subinstr(p_value_modifier,",","",10) 
gen z_modifier=0
replace z_modifier=1 if p_value_modifier=="<"
replace z_modifier=-1 if p_value_modifier==">"

gen z=-invnormal(p/2)

gen start_year=substr(start_date,1,4)
destring start_year, replace
gen start_month=substr(start_date,6,2)
destring start_month, replace

gen completion_year=substr(completion_date,1,4)
destring completion_year, replace
gen completion_month=substr(completion_date,6,2)
destring completion_month, replace

gen Ph2=0
gen Ph3=0
replace Ph2=1 if phase=="Phase 2"
replace Ph3=1 if phase=="Phase 3"

gen INDUSTRY=0
replace INDUSTRY=1 if agency_class=="Industry"

gen NI=0
replace NI=1 if non_inferiority_type=="Non-Inferiority or Equivalence"

gen PRIMARY=0
replace PRIMARY=1 if outcome_type=="Primary"

keep if study_type=="Interventional"


*------------------------------------------------------------------*
* Section 3 -- Generate Rankings of Industry Sponsors by Firm Size *
*------------------------------------------------------------------*

** ranking by revenue 2018
gen rev_rank=.
replace rev_rank=1 if regexm(agency,"Johnson & Johnson") | regexm(agency,"Janssen Research & Development")
replace rev_rank=2 if regexm(agency,"AbbVie") | regexm(agency,"Abbott")
replace rev_rank=3 if regexm(agency,"Hoffmann-La Roche")
replace rev_rank=4 if regexm(agency,"Pfizer")
replace rev_rank=5 if regexm(agency,"Novartis")
replace rev_rank=6 if regexm(agency,"Bayer")
replace rev_rank=7 if regexm(agency,"GlaxoSmithKline") | regexm(agency,"GSK")
replace rev_rank=8 if regexm(agency,"Merck Sharp & Dohme Corp.")
replace rev_rank=9 if regexm(agency,"Sanofi")
replace rev_rank=10 if regexm(agency,"Eli Lilly")
replace rev_rank=11 if regexm(agency,"Amgen")
replace rev_rank=12 if regexm(agency,"Bristol-Myers Squibb")
replace rev_rank=13 if regexm(agency,"Gilead")
replace rev_rank=14 if regexm(agency,"AstraZeneca")
replace rev_rank=15 if regexm(agency,"Danaher") //no trials found??
replace rev_rank=16 if regexm(agency,"Boehringer Ingelheim")
replace rev_rank=17 if regexm(agency,"Takeda")
replace rev_rank=18 if regexm(agency,"Teva")
replace rev_rank=19 if regexm(agency,"Novo Nordisk")
replace rev_rank=20 if regexm(agency,"Merck KGaA")
*source: https://en.wikipedia.org/wiki/List_of_largest_biomedical_companies_by_revenue (collected from official financial statements)

gen top10rev=0
replace top10rev=1 if rev_rank<=10


** ranking by prescription sales 2018
gen rxsales_rank=.
replace rxsales_rank=1 if regexm(agency,"Pfizer")
replace rxsales_rank=2 if regexm(agency,"Hoffmann-La Roche")
replace rxsales_rank=3 if regexm(agency,"Novartis")
replace rxsales_rank=4 if regexm(agency,"Johnson & Johnson") | regexm(agency,"Janssen Research & Development")
replace rxsales_rank=5 if regexm(agency,"Merck Sharp & Dohme Corp.")
replace rxsales_rank=6 if regexm(agency,"AbbVie") | regexm(agency,"Abbott")
replace rxsales_rank=7 if regexm(agency,"Sanofi")
replace rxsales_rank=8 if regexm(agency,"GlaxoSmithKline") | regexm(agency,"GSK")
replace rxsales_rank=9 if regexm(agency,"Amgen")
replace rxsales_rank=10 if regexm(agency,"Gilead")
replace rxsales_rank=11 if regexm(agency,"Bristol-Myers Squibb")
replace rxsales_rank=12 if regexm(agency,"AstraZeneca")
replace rxsales_rank=13 if regexm(agency,"Eli Lilly")
replace rxsales_rank=14 if regexm(agency,"Bayer")
replace rxsales_rank=15 if regexm(agency,"Novo Nordisk")
replace rxsales_rank=16 if regexm(agency,"Takeda")
replace rxsales_rank=17 if regexm(agency,"Celgene")
replace rxsales_rank=18 if regexm(agency,"Shire")
replace rxsales_rank=19 if regexm(agency,"Boehringer Ingelheim")
replace rxsales_rank=20 if regexm(agency,"Allergan")
*source: Pharmaceutical Executive June 2019 (based on data from EvaluatePharma)


** ranking by RD spending 2018
gen rdspend_rank=.
replace rdspend_rank=1 if regexm(agency,"Hoffmann-La Roche")
replace rdspend_rank=2 if regexm(agency,"Johnson & Johnson") | regexm(agency,"Janssen Research & Development")
replace rdspend_rank=3 if regexm(agency,"Novartis")
replace rdspend_rank=4 if regexm(agency,"Pfizer")
replace rdspend_rank=5 if regexm(agency,"Merck Sharp & Dohme Corp.")
replace rdspend_rank=6 if regexm(agency,"Sanofi")
replace rdspend_rank=7 if regexm(agency,"AbbVie") | regexm(agency,"Abbott")
replace rdspend_rank=8 if regexm(agency,"AstraZeneca")
replace rdspend_rank=9 if regexm(agency,"Bristol-Myers Squibb")
replace rdspend_rank=10 if regexm(agency,"Eli Lilly")
replace rdspend_rank=11 if regexm(agency,"GlaxoSmithKline") | regexm(agency,"GSK")
replace rdspend_rank=12 if regexm(agency,"Celgene")
replace rdspend_rank=13 if regexm(agency,"Gilead")
replace rdspend_rank=14 if regexm(agency,"Amgen")
replace rdspend_rank=15 if regexm(agency,"Bayer")
replace rdspend_rank=16 if regexm(agency,"Boehringer Ingelheim")
replace rdspend_rank=17 if regexm(agency,"Takeda")
replace rdspend_rank=18 if regexm(agency,"Biogen")
replace rdspend_rank=19 if regexm(agency,"Novo Nordisk")
replace rdspend_rank=20 if regexm(agency,"Regeneron")
*source: Pharmaceutical Executive June 2019 (based on data from EvaluatePharma)

** ranking by no. relevant trials reported
gen rep_rank=.
replace rep_rank=1 if regexm(agency,"GlaxoSmithKline") | regexm(agency,"GSK") //with subsidies
replace rep_rank=2 if regexm(agency,"Pfizer") //with subsidies
replace rep_rank=3 if regexm(agency,"Merck Sharp & Dohme Corp.") //!= Merck KgaA
replace rep_rank=4 if regexm(agency,"Eli Lilly")
replace rep_rank=5 if regexm(agency,"Boehringer Ingelheim")
replace rep_rank=6 if regexm(agency,"AstraZeneca")
replace rep_rank=7 if regexm(agency,"Hoffmann-La Roche")
replace rep_rank=8 if regexm(agency,"Novartis")
replace rep_rank=9 if regexm(agency,"Takeda")
replace rep_rank=10 if regexm(agency,"Shire")
replace rep_rank=11 if regexm(agency,"Amgen")
replace rep_rank=12 if regexm(agency,"Bayer")
replace rep_rank=13 if regexm(agency,"Sanofi")
replace rep_rank=14 if regexm(agency,"Johnson & Johnson") | regexm(agency,"Janssen Research & Development")
replace rep_rank=15 if regexm(agency,"Gilead")
replace rep_rank=16 if regexm(agency,"Bristol-Myers Squibb")
replace rep_rank=17 if regexm(agency,"Otsuka") //with 2 different names
replace rep_rank=18 if regexm(agency,"AbbVie") | regexm(agency,"Abbott")
replace rep_rank=19 if regexm(agency,"Novo Nordisk")
replace rep_rank=20 if regexm(agency,"Merck KGaA")


*------------------------------------------------*
* Section 4 -- Merge Tables II & Restrict Sample *
*------------------------------------------------*

merge m:1 nct_id using "${data}${inter}placebo_list.dta"
drop if _merge==2
drop _merge

keep if PRIMARY==0
keep if NI==0
replace z_modifier=0 if z_modifier==-1 & z==0

count if agency=="Colgate Palmolive" & (Ph2==1 | Ph3==1)
count if agency=="Colgate Palmolive" & p==.05 & (Ph2==1 | Ph3==1)
keep if agency!="Colgate Palmolive"

drop if nct_id=="NCT02799472" //drop trial with 211 primary outcomes

merge m:1 nct_id using "${data}${inter}nct_market_size.dta"
drop if _merge==2
drop _merge

unique nct_id if (Ph2==1 | Ph3==1) & mkt_size==.
replace mesh_code="missing" if mesh_code==""

egen aux=mean(mkt_size)
replace mkt_size=aux if mkt_size==.
drop aux
replace mkt_size=. if mkt_size==0
egen aux=min(mkt_size)
replace mkt_size=aux if mkt_size==.
drop aux


gen log_mkt=log(mkt_size)
gen D1=0
replace D1=1 if p==0.001 & z_modifier==1
gen D2=0
replace D2=1 if p==0.0001 & z_modifier==1
gen D0=1-D1-D2
gen sqrt_enrollment=sqrt(enrollment)


*-------------------------------------------------*
* Section 5 -- Generate Dummies for Fixed Effects *
*-------------------------------------------------*

tab completion_year
levelsof completion_year if completion_year>2006 & completion_year<2019, local(cys) // omitted category: before 2007
foreach y of local cys {
	gen cy_`y'=0
 	replace cy_`y'=1 if completion_year==`y'
}
replace cy_2018=1 if completion_year>2018 // cy_2018 means 2018 or later

tab mesh_code
gen mesh_aux=subinstr(mesh_code,"/","_",10)
levelsof mesh_aux if mesh_code!="C01" & mesh_code!="C02" & mesh_code!="C03" & mesh_code!="C11" & mesh_code!="C15" & mesh_code!="C24" & mesh_code!="C26" & mesh_code!="missing", local(code) clean // omitted category: missing/others (missing, C01,C02,C03,C11,C15,C24,C26)
foreach c of local code {
	gen mc_`c'=0
 	replace mc_`c'=1 if mesh_aux=="`c'"
}
drop mesh_aux

save "${data}${final}data_counterfactual_analysis_sec.dta", replace

log close 
cd "${run}${dofiles}"
